import os
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import feather
import plotly.express as px
import psutil
import libpysal as lp
import esda
import rasterio as rio
import contextily as ctx
import shapely.geometry as geom
import mapclassify as mc
%matplotlib inline
# Move from the notebook directory into the sibling data directory
# (single relative step instead of two chained chdir calls).
os.chdir(os.path.join('..', 'data'))
os.getcwd()
In this notebook, we will conduct some exploratory spatial data analysis on the afgifte dataset. Unlike the previous general ESDA (see afgifte_esda), we will explore 10 dependent variables: log(kg) of secondary resources received in each postcode, categorized by the top 10 materials by weight. The material categorizations have been refined: instead of using the ewc code, which represents industrial categories, we are using an esv code, which represents actual material categories more closely. In this notebook, we will do:
Here we prepare a geodataframe for the ESDA. The dataframe should contain the following columns:
# Load the matched LMA delivery records and keep only the columns needed
# for the material-level ESDA.
df = feather.read_dataframe('lma/af_2019-2020_matched-codes.feather')
df = df.loc[:, ['eaPostcode', 'gncChap', 'esvChap', 'kg']]
# Truncate full postcodes to their 4-digit (PC4) prefix.
df['eaPostcode'] = df['eaPostcode'].str[:4]
def mat(row):
    """Return a prefixed material label for one record.

    Records without a GNC chapter (marked '--') fall back to the
    ESV chapter with an 'EWC-' prefix; otherwise the GNC chapter
    is used with a 'GNC-' prefix.
    """
    if row.gncChap == '--':
        return 'EWC-' + row.esvChap
    return 'GNC-' + row.gncChap
# Label every record with its material category (apply passes each row
# directly -- no lambda wrapper needed).
df['mat'] = df.apply(mat, axis=1)
# Aggregate kg per (postcode, material) pair.
df = df.groupby(['eaPostcode', 'mat']).sum().reset_index()
# Attach PC4 geometries from the national postcode-4 shapefile.
pc4 = gpd.read_file('spatial-data/nl_pc4.shp')[['PC4', 'geometry']]
df = gpd.GeoDataFrame(
    pd.merge(df, pc4, how='left', left_on='eaPostcode', right_on='PC4')
)
def log(x, logType):
    """Log-transform a weight, mapping non-positive values to 0.

    Parameters
    ----------
    x : float
        Weight in kg; postcodes that received nothing have x == 0.
    logType : str
        'logn' for the natural log, 'log10' for base 10.  The previous
        version accepted this argument but ignored it and always used
        the natural log; it is now honoured.  Behaviour for 'logn'
        (the only value used in this notebook) is unchanged.

    Returns
    -------
    float
        log(x) in the requested base, or 0 for x <= 0 so that empty
        postcodes stay at zero instead of producing -inf/NaN.
    """
    if x <= 0:
        return 0
    return np.log10(x) if logType == 'log10' else np.log(x)
# Natural-log transform of kg (zeros stay zero, see the log() helper).
df['logn'] = df['kg'].apply(lambda x: log(x, logType='logn'))
# Tidy up: drop the duplicate postcode column, order the columns, and
# sort heaviest-receiving postcodes first for display.
df = df.drop(columns=['eaPostcode'])
df = df[['PC4', 'mat', 'kg', 'logn', 'geometry']]
df = df.sort_values('logn', ascending=False)
print(df.shape)
df.head()
# Rank materials by total weight received and keep the heaviest n.
n = 10
topMat = (
    df.groupby('mat')
      .sum()
      .kg
      .reset_index()
      .sort_values('kg', ascending=False)
      .head(n)
)
# Share of GNC- vs EWC-coded material within the top n.
gncKg = topMat.loc[topMat.mat.str.startswith('GNC'), 'kg'].sum()
ewcKg = topMat.loc[topMat.mat.str.startswith('EWC'), 'kg'].sum()
totKg = topMat.kg.sum()
percGnc = round(gncKg / totKg * 100, 2)
percEwc = round(ewcKg / totKg * 100, 2)
# Pie chart of the top-n material mix.
fig = px.pie(topMat, values='kg', names='mat',
             title='Top {} materials by weight'.format(n))
fig.show()
# Flatten the ranking to a plain list.  NOTE: despite its name, 'top10'
# holds ranks 6-10 only -- the plotting loops below draw the two halves
# as separate rows of five maps each.
topMat = topMat.mat.tolist()
top5, top10 = topMat[:5], topMat[5:]
print('top {} materials by weight: {}'.format(n, topMat))
print('% gnc by weight: {}\n% ewc by weight: {}'.format(percGnc, percEwc))
# Build one GeoDataFrame per top material, reindexed onto the full PC4
# geometry so every postcode appears; postcodes with no flow of that
# material get mat='--' and kg/logn of 0.
matFlows = {}
for material in topMat:
    flow = df.loc[df.mat == material, ['PC4', 'mat', 'kg', 'logn']]
    flow = pd.merge(pc4, flow, how='left', on='PC4')
    flow['mat'] = flow['mat'].fillna('--')
    flow['kg'] = flow['kg'].fillna(0)
    flow['logn'] = flow['logn'].fillna(0)
    matFlows[material] = flow
Here we take a first glance of the data by creating histograms. The three figures below show the distribution of values (kg received) throughout the dataset for each type of industry flow. This step doesn't say anything about the spatial relationships - we will explore in the rest of this notebook.
# Histograms per material: raw kg on the top row, log-transformed kg on
# the bottom row; zero-kg postcodes are excluded from both.
for group in (top5, top10):
    fig, ax = plt.subplots(2, 5, figsize=(30, 10))
    for col, material in enumerate(group):
        nonzero = matFlows[material]
        nonzero = nonzero[nonzero.kg != 0]
        sn.histplot(data=nonzero, x="kg", bins=30, ax=ax[0, col])
        ax[0, col].set_title('# postcodes for {}'.format(material))
        sn.histplot(data=nonzero, x="logn", bins=30, ax=ax[1, col])
    plt.show()
Like the total kg received, the kg received per industry also follows a log distribution, meaning that there are lots of postcodes receiving a low kg of waste, and a few postcodes receiving high amounts of waste. However, after log-transformation, the manufacturing industry doesn't seem to follow a normal distribution very well. This could be because there aren't a lot of postcodes associated with the manufacturing industry in the first place. The construction and agriculture industries seem to follow a normal distribution pretty well.
I'm also trying to understand whether to choose a log transformation of base 10 or base e (natural log). Both log transformations result in the exact same shape, but the scale on the x-axis is different. This is expected: log10(x) = ln(x)/ln(10), so the two transforms differ only by a constant factor, which rescales the x-axis without changing the shape of the distribution.
Here we map out the kg and log(kg) of secondary resources received for each industry type. Some observations of the maps:
# Coarser PC2 geometry drawn underneath as a light backdrop, so that
# postcodes filtered out (zero kg) still leave a visible outline.
pc2 = gpd.read_file('spatial-data/nl_pc2.shp')
for group in (top5, top10):
    fig, ax = plt.subplots(1, 5, figsize=(8 * 5, 7))
    for col, material in enumerate(group):
        flow = matFlows[material]
        flow = flow[flow.kg != 0]  # hide postcodes that received nothing
        pc2.plot(ax=ax[col], color='#f0f0f0')
        flow.plot(ax=ax[col], column='logn', cmap='OrRd', legend=True)
        ax[col].axis('off')
        ax[col].set_title(material + ' (log values)')
    plt.show()
Another aspect of the data that we need to consider are the zero values. 60% of postcodes in the Netherlands don't receive any secondary resources - this is highlighted in red in the map below. That is a lot of datapoints that will skew the data - so we need to think about whether to include the postcodes with zero kg received in our analysis later. For now, I've kept the zero values, because I think it's important to take these postcodes into account.
# Map only the zero-kg postcodes (red) and report their share of all
# PC4 postcodes in each subplot title.
for group in (top5, top10):
    fig, ax = plt.subplots(1, 5, figsize=(5 * 5, 8))
    for col, material in enumerate(group):
        zeros = matFlows[material]
        zeros = zeros[zeros.kg == 0]
        pc2.plot(ax=ax[col], color='#f0f0f0')
        zeros.plot(ax=ax[col], color='red')
        ax[col].axis('off')
        share = round(len(zeros) / len(pc4) * 100, 2)
        ax[col].set_title('{} zero values ({}%)'.format(material, share))
    plt.show()
Before conducting any spatial analysis, it's important to find out whether the data has any relationship to space - does the data follow any kind of spatial pattern, or are the values just randomly distributed through space? Although we can answer this question from just looking at the map and trying to detect patterns with our eyes, we can also use spatial statistics to get a more accurate understanding of the data.
In order to do this, we look at the dataset and ask the question: 'If we threw values (kg) randomly on the map in simulations, how likely would these simulations resemble our actual data?' or in other words, 'what is the probability that our data (on kg received per pc) is randomly distributed in space?'
If we find that our data is not randomly distributed in space, we can assume that our data is somehow affected by geographical space, and proceed with our spatial analysis. However, if we find that our data is randomly distributed in space, there is no point in conducting further spatial analysis. In this case, we will have to further separate the data (by material type, for example) until we get a dataset that isn't randomly distributed in space.
Before we answer the questions above, we first need to find out for each postcode who its neighbors are, and define what we mean by 'neighbors'. For our analysis, we define neighbors as any postcodes that share an edge or a vertex (point). This definition is called a 'queen spatial weight', in reference to the queen chess piece.
Then, we need to describe, for each postcode, the conditions of its neighborhood. We do this by using a process of calculation called spatial lag. Here, for each postcode, we identify its neighbors, then take an average of all their values. This average neighbors' value is then attributed to the postcode. You can see this process in the maps below - the map on the left shows the actual kg received for each post code. The map on the right shows the average kg received of each postcode by its neighbors (and not itself).
This allows us to understand whether the postcodes in our data have a relationship to their spatial neighbors. For example, do postcodes with high values tend to have neighbors with high values as well? Or is it just random, where some high value postcodes have high value neighbors, while others don't?
# Queen-contiguity spatial weights for the whole country.  One matrix
# serves every material flow, since they all share the same PC4 geometry.
# A postcode's neighbors are all postcodes sharing an edge or a vertex.
wq = lp.weights.Queen.from_dataframe(pc4)
# Row-standardise: each postcode's neighbor weights sum to 1, so the
# spatial lag computed later is the neighborhood average.
wq.transform = 'r'
As seen above, the pc4 shpfile has two islands. These postcodes don't have direct neighbors (aka don't share an edge or vertex with another postcode). This will pose problems further down the notebook - calculating local moran's I requires all elements to have at least one neighbor. For now, I solve the problem by just dropping these two postcodes just before I do the Local Moran's I calculations, but there are better solutions:
# Compute, for each material, the natural log of the spatially lagged kg.
# The lag is taken on the raw kg (NOT on logn): averaging log values
# yields the log of the geometric mean, which is not the statistic we
# want here (see the note below this cell).
def _log_or_zero(x):
    """Natural log, with 0 mapped to 0 so empty neighborhoods stay 0."""
    return 0 if x == 0 else np.log(x)

for material in topMat:
    temp = matFlows[material].copy()
    # Spatial lag: row-standardised average kg of each postcode's neighbors.
    temp['lag_logn'] = lp.weights.lag_spatial(wq, temp['kg'])
    # Log-transform the lagged averages in place.
    # Fixed: the helper used to be (re)defined inside this loop on every
    # iteration, shadowing the earlier log(x, logType) function at module
    # scope; it is now hoisted once under a private name.
    temp.lag_logn = temp.lag_logn.map(_log_or_zero)
    matFlows[material] = temp
Important: you cannot directly average log values — the mean of logs equals the log of the geometric mean, not the log of the arithmetic mean. So we average the kg values first and then log-transform that average; otherwise the result is not meaningful. It would be good to check this reasoning with Alexis (statistics prof).
# Side-by-side maps per material: observed log(kg) (top row) versus the
# spatially lagged log(kg) (bottom row).
for group in (top5, top10):
    fig, ax = plt.subplots(2, 5, figsize=(5 * 5, 8))
    for row, column_name in enumerate(['logn', 'lag_logn']):
        for col, material in enumerate(group):
            flow = matFlows[material]
            flow.plot(column=column_name, ax=ax[row][col],
                      legend=True, cmap='OrRd')
            ax[row][col].axis('off')
            ax[row][col].set_title(column_name + ' kg received for ' + material)
    plt.show()
the spatial lag patterns for some of the material types seem a little strange - they look like misshapen doughnuts. This is a result of a high value surrounded by low or zero values. Perhaps in this case it would be more meaningful to calculate the distance-based spatial lag, rather than queen spatial lag.
Although these p-values in the previous paragraph are impressive, it isn't accurate because we've simplified the data into binary values, losing a lot of information in the process. In order to get a more accurate understanding, we will do the same process on our dataset with continuous values. However, with continuous values, we cannot use join counts to analyse the data, because there are an infinite number of join types between neighbors. So instead, we use another method to measure spatial clustering - Moran's I. (Here's a helpful video explaining it)
Moran's I ranges from -1 to 1. 1 means that the data is perfectly clustered; 0 means perfectly random; and -1 means perfectly dispersed.
# Display the reference figure explaining Moran's I, stored next to the notebook.
from IPython.display import Image
print('Moran\'s I')
Image("../images/morans_i.png", width=600)
# Global Moran's I per material: how strongly log(kg) clusters in space.
matMI = dict.fromkeys(topMat, None)
for v in topMat:  # fixed: the index from enumerate() was never used
    temp = matFlows[v]
    y = temp['logn']
    np.random.seed(12345)  # reproducible permutation inference
    mi = esda.moran.Moran(y, wq)
    # Store both the full result object (needed later for .sim/.EI/.p_sim)
    # and the rounded statistic for quick display.
    matMI[v] = [mi, round(mi.I, 2)]
    print('Moran\'s I for {}: {}'.format(v, round(mi.I, 2)))
# Histogram of the Moran's I statistics across the top materials.
print('')
mis = pd.DataFrame.from_dict(matMI).T
mis.rename(columns={1: 'morans_I'}, inplace=True)
mis.drop(labels=[0], axis=1, inplace=True)
sn.histplot(data=mis, x='morans_I', bins=10)
plt.show()
The Moran's Is for the industry flows range between 0.03-0.24, which seems quite low. However, Moran's I cannot be interpreted at face value, because it is an inferential statistic (at least according to this website). We can only interpret this number by comparing it to the moran's I of random simulations, similar to the previous process with the binary data. In the figure below, the blue curve shows the distribution of Moran's I values in 999 simulations, the black line shows the average Moran's I value of the simulations, and the red line shows the Moran's I of our actual data.
In the figures below, we can see that the log(kg) values have a statistically significant spatial clustering (p-value = 0.001), while the spatial clustering of the normal (kg) values are not statistically significant (p-value = 0.088 > 0.05). This means that it would be worth conducting spatial analysis on the log(kg) values, but not on the normal kg values. We should also explore further the spatial clustering of different material types to see if those are statistically significant as well.
# KDE of Moran's I under 999 random permutations vs the observed value.
# Fixed: matMI[v] holds [Moran object, rounded I] (see the cell above),
# so the result object is matMI[v][0]; the original read attributes off
# the list itself, raising AttributeError ('list' has no attribute 'EI').
v = 'EWC-12.1'
mi = matMI[v][0]
print('Average Morans I for simulations: {}'.format(mi.EI))
for top in [top5, top10]:
    fig, ax = plt.subplots(1, 5, figsize=(6 * 5, 5))
    for i, v in enumerate(top):
        mi = matMI[v][0]
        sn.kdeplot(mi.sim, shade=True, ax=ax[i])  # kde of the 999 simulated I values
        ax[i].vlines(mi.I, 0, 40, color='r')  # observed Moran's I
        ax[i].vlines(mi.EI, 0, 40)            # expected I under spatial randomness
        ax[i].set_xlabel("Moran's I")
        ax[i].set_title("{} (p-value: {})".format(v, mi.p_sim))
    plt.show()
Although all p-values are lower than 0.05, there is a chance that the spatial distribution of the manufacturing industry receivers is random - the red line overlaps the blue curve in the second graph. This means that it might not be worth analyzing the manufacturing industry flows, because there isn't a strong spatial pattern here. Again, the next step is either to refine the sbi code matching method to allow more flows to match with manufacturing sbi codes; or to categorize the flows using materials rather than industry type.
The previous analysis explained how spatially clustered the data is as a whole, or its 'global' spatial auto-correlation. Now, we will try to understand the local neighborhood conditions of each postcode, and identifying postcodes that are outliers. To get a better understanding of the data, we created a scatterplot that shows, for each postcode, the relationship for kg received and average kg received by its neighbors. The figure on the right shows a scatterplot of log(kg) values.
The scatterplot can be understood by separating it into four quadrants (the black dotted lines):
np.random.seed(12345)
# Moran scatterplots: for each postcode, log(kg) received versus the
# average log(kg) of its neighbors; zero-value postcodes excluded.
for top in [top5, top10]:
    fig, ax = plt.subplots(1, 5, figsize=(8 * 5, 8))
    for i, v in enumerate(top):
        temp = matFlows[v]
        temp = temp[temp.logn != 0]  # drop postcodes that received nothing
        # PREPARE VALUES
        kg = temp['logn']
        lag_kg = temp['lag_logn']  # neighborhood average of log(kg)
        # Least-squares line y = b*x + a through the scatter (degree-1 fit).
        b, a = np.polyfit(kg, lag_kg, 1)
        # PLOT
        ax[i].plot(kg, lag_kg, '.', color='lightblue')
        ax[i].plot(kg, b * kg + a, 'r')  # fitted line in red
        # Fixed: matMI[v] is [Moran object, rounded I]; index [0] before
        # reading .I (the original indexed the list and raised AttributeError).
        ax[i].set_title('Moran Scatterplot for {} (Mi = {})'.format(v, round(matMI[v][0].I, 3)))
        ax[i].set_ylabel('Spatial Lag of log(kg)')
        ax[i].set_xlabel('log(kg)')
        # Dashed quadrant lines at the means of both axes.
        ax[i].vlines(kg.mean(), lag_kg.min(), lag_kg.max(), linestyle='--')
        ax[i].hlines(lag_kg.mean(), kg.min(), kg.max(), linestyle='--')
    plt.show()
There seems to be barely any correlation between the amount of waste received by each postcode, and the amount received by its neighbors. This suggests that the spatial pattern is weak, especially for the manufacturing industry. If there was a strong spatial pattern, there should be more correlation between how much a pc receives and how much is received by its neighbor. And a strong spatial pattern would mean that kg received by a postcode has something to do with its location.
fixing disconnected components: https://geographicdata.science/book/notebooks/04_spatial_weights.html
# Rebuild the queen weights without the island postcodes (polygons with
# no contiguous neighbor), which Local Moran's I cannot handle.
pcIslands = pc4.iloc[wq.islands].PC4.tolist()
pc4ni = pc4.loc[~pc4.PC4.isin(pcIslands)]
wq = lp.weights.Queen.from_dataframe(pc4ni)
wq.transform = 'r'  # row-standardise again
from matplotlib import colors
# LISA category -> [quadrant code in li.q, plot colour].
# Codes follow esda's convention: 1 = high-high hot spot, 2 = low-high
# 'doughnut', 3 = low-low cold spot, 4 = high-low 'diamond'.
spotsDict = {
    'hotspots': [1, 'red'],
    'coldspots': [3, 'blue'],
    'doughnuts': [2, 'blue'],  # fixed typo: was 'doughbuts'
    'diamonds': [4, 'red']
}
# One LISA map per (material, spot category): each figure highlights a
# single significance class from spotsDict across the five materials.
for top in (top5, top10):
    for s in list(spotsDict.keys()):
        f,ax = plt.subplots(1,5,figsize=(6*5,8), subplot_kw=dict(aspect='equal'))
        for i, v in enumerate(top):
            # make temp df; islands have no neighbors, which Moran_Local cannot handle
            temp = matFlows[v]
            temp = temp[~temp.PC4.isin(pcIslands)] # drop rows with islands
            # calculate local moran's I for temp df
            y = temp['logn']
            li = esda.moran.Moran_Local(y, wq)
            sig = li.p_sim < 0.05
            n_sig = (li.p_sim < 0.05).sum()  # NOTE(review): computed but never used
            # helper: paint postcodes where spotType is True in `color`, the rest light grey
            def plotSpots(spotType, color, i, title):
                spots = ['n.sig.', title]
                labels = [spots[i] for i in spotType*1] # *1 casts the boolean mask to 0/1 ints so it can index `spots`
                hmap = colors.ListedColormap([color, 'lightgrey'])
                # Categorical plots assign colormap colours in sorted label order;
                # NOTE(review): this assumes `title` sorts before 'n.sig.' so the
                # highlighted class gets `color` -- verify for all material names.
                temp.assign(cl=labels).plot(column='cl', categorical=True,
                                            k=2, cmap=hmap, linewidth=0.1, ax=ax[i],
                                            edgecolor='white', legend=True, legend_kwds={'loc': 'lower left', 'bbox_to_anchor':(0.,0.,0.,0.)})
                ax[i].set_axis_off()
                ax[i].set_title(title)
            # plot postcodes whose quadrant code matches this category and p_sim < 0.05
            sig = li.p_sim < 0.05
            spot = sig * li.q == spotsDict[s][0]
            plotSpots(spotType=spot, color=spotsDict[s][1], i=i, title=v + ' ' + s)
        plt.show()
from matplotlib import colors
# Combined LISA map: all four quadrant categories in one figure per material.
# Colours are assigned in sorted label order ('0 ns', '1 hot spot', ...),
# matching the colormap order below.
hmap = colors.ListedColormap(['lightgrey', 'red', 'lightblue', 'blue', 'pink'])
for top in [top5, top10]:
    f, ax = plt.subplots(1,5, figsize=(6*5,8))
    for i, v in enumerate(top):
        d = matFlows[v]
        # calculate local Moran's I for each df
        d = d[~d.PC4.isin(pcIslands)] # drop rows with islands (no neighbors)
        y = d['logn']
        li = esda.moran.Moran_Local(y, wq)
        n_sig = (li.p_sim < 0.05).sum()  # NOTE(review): computed but never used
        # Encode each postcode's LISA category: 0 = not significant, otherwise
        # the quadrant code from li.q (1 = hot spot, 2 = doughnut, 3 = cold
        # spot, 4 = diamond); the codes are mutually exclusive so the sum
        # below picks exactly one per postcode.
        sig = 1 * (li.p_sim < 0.05)
        hotspot = 1 * (sig * li.q==1)
        coldspot = 3 * (sig * li.q==3)
        doughnut = 2 * (sig * li.q==2)
        diamond = 4 * (sig * li.q==4)
        spots = hotspot + coldspot + doughnut + diamond
        spot_labels = [ '0 ns', '1 hot spot', '2 doughnut', '3 cold spot', '4 diamond']
        labels = [spot_labels[i] for i in spots]
        # plot
        d.assign(cl=labels).plot(column='cl', categorical=True, \
            k=2, cmap=hmap, linewidth=0.1, ax=ax[i], \
            edgecolor='white', legend=True, legend_kwds={'loc': 'lower left', 'bbox_to_anchor':(0.,0.,0.,0.)})
        ax[i].set_axis_off()
        ax[i].set_title(v)
    plt.show()
Would be good to verify these hotspots on google maps. For example, if we see that a hotspot of the agriculture industry is right in the middle of an industrial area, we know that there is probably something wrong with the data. I'm asking this question because the hotspots for manufacturing industry receivers don't seem to be in major industrial areas.